library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.3 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.3 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.0
✔ purrr 1.0.2 ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
# Load the EXIF metadata export; no col_types are given, so readr guesses them.
exif <- readr::read_csv(file = "capstone_exif.csv")
Rows: 1116 Columns: 15── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (10): DateTimeOriginal, CreateDate, ModifyDate, Software, LensInfo, LensModel, ExposureTime, FocalLength, FocalLengthIn3...
dbl (5): FlickrID, JFIFVersion, ISO, FNumber, BrightnessValue
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the image-measurement export; no col_types are given, so readr guesses
# them.
img_data <- readr::read_csv(file = "capstone_img_data.csv")
Rows: 10955 Columns: 87── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (16): flickr, img_loc, the_image, crop_coords, center_rgb, post_top_hsl, post_2_hsl, post_3_hsl, post_4_hsl, post_5_hsl,...
dbl (70): using_id, img_width, img_height, do_img_at, sub_img, full_id, r_min, r_max, r_mean, r_mode, g_min, g_max, g_mean, ...
num (1): vivid_count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the full column specification readr recorded for `exif`.
readr::spec(exif)
cols(
FlickrID = col_double(),
DateTimeOriginal = col_character(),
CreateDate = col_character(),
ModifyDate = col_character(),
Software = col_character(),
LensInfo = col_character(),
LensModel = col_character(),
JFIFVersion = col_double(),
ISO = col_double(),
ExposureTime = col_character(),
FNumber = col_double(),
FocalLength = col_character(),
FocalLengthIn35mmFormat = col_character(),
BrightnessValue = col_double(),
SubjectArea = col_character()
)
# Print the full column specification readr recorded for `img_data`.
readr::spec(img_data)
cols(
flickr = col_character(),
using_id = col_double(),
img_loc = col_character(),
the_image = col_character(),
img_width = col_double(),
img_height = col_double(),
crop_coords = col_character(),
do_img_at = col_double(),
sub_img = col_double(),
full_id = col_double(),
r_min = col_double(),
r_max = col_double(),
r_mean = col_double(),
r_mode = col_double(),
g_min = col_double(),
g_max = col_double(),
g_mean = col_double(),
g_mode = col_double(),
b_min = col_double(),
b_max = col_double(),
b_mean = col_double(),
b_mode = col_double(),
center_rgb = col_character(),
post_num_regions = col_double(),
post_top_hsl = col_character(),
post_top_count = col_double(),
post_2_hsl = col_character(),
post_2_count = col_double(),
post_3_hsl = col_character(),
post_3_count = col_double(),
post_4_hsl = col_character(),
post_4_count = col_double(),
post_5_hsl = col_character(),
post_5_count = col_double(),
post_6_hsl = col_character(),
post_6_count = col_double(),
center_hsl = col_character(),
full_red_count = col_double(),
visib_red_count = col_double(),
vivid_red_count = col_double(),
full_orange_count = col_double(),
visib_orange_count = col_double(),
vivid_orange_count = col_double(),
full_yellow_count = col_double(),
visib_yellow_count = col_double(),
vivid_yellow_count = col_double(),
full_green_count = col_double(),
visib_green_count = col_double(),
vivid_green_count = col_double(),
full_cyan_count = col_double(),
visib_cyan_count = col_double(),
vivid_cyan_count = col_double(),
full_blue_count = col_double(),
visib_blue_count = col_double(),
vivid_blue_count = col_double(),
full_purple_count = col_double(),
visib_purple_count = col_double(),
vivid_purple_count = col_double(),
full_mag_count = col_double(),
visib_mag_count = col_double(),
vivid_mag_count = col_double(),
vivid_count = col_number(),
sat_min_val = col_double(),
sat_25_val = col_double(),
sat_50_val = col_double(),
sat_75_val = col_double(),
sat_max_val = col_double(),
hue_mean_val = col_double(),
sat_mean_val = col_double(),
light_mean_val = col_double(),
light_max_val = col_double(),
light_max_count = col_double(),
light_min_val = col_double(),
light_min_count = col_double(),
light_25_value = col_double(),
light_50_value = col_double(),
light_75_value = col_double(),
gen_bright_count = col_double(),
gen_dark_count = col_double(),
common_hsl_1_val = col_character(),
common_hsl_1_count = col_double(),
common_hsl_2_val = col_character(),
common_hsl_2_count = col_double(),
common_hsl_3_val = col_character(),
common_hsl_3_count = col_double(),
common_hsl_4_val = col_character(),
common_hsl_4_count = col_double()
)
Pre Import:
Sub-image data for main images (0) was bugged in the first hours of
image processing. This was fixed in Excel during the data collation
stage.
[Editor's Note: the numeric values looked consistent in Excel and RStudio, but were interpreted as characters, necessitating a large amount of manual re-typing in part 2. At the time, that seemed faster than backing up and figuring out the ‘right’ way to fix them.]
EXIF:
# Rename the EXIF columns in place from CamelCase to lower snake_case: an
# underscore is inserted wherever a lowercase letter or digit is followed by a
# capital (CreateDate -> create_date); consecutive capitals stay together
# (FNumber -> fnumber).
names(exif) <- tolower(gsub("([a-z0-9])([A-Z])", "\\1_\\2", names(exif)))

# Drop the columns not carried forward, then give the two columns with explicit
# defaults a value wherever they are missing.
exif_tidy <- exif %>%
  select(
    -c(date_time_original, modify_date, lens_info, fnumber, focal_length)
  ) %>%
  replace_na(list(subject_area = "0 0 0 0", jfifversion = 0))
img_data:
# Drop the columns not carried forward, then fill missing posterized HSL
# entries (slots 2-6) with a "(-1, -1, -1)" placeholder tuple.
imgsd_tidy <- img_data %>%
  select(
    -c(
      flickr, img_loc, the_image, img_width, img_height,
      crop_coords, do_img_at, r_mode, b_mode, g_mode
    )
  ) %>%
  replace_na(
    list(
      post_2_hsl = "(-1, -1, -1)",
      post_3_hsl = "(-1, -1, -1)",
      post_4_hsl = "(-1, -1, -1)",
      post_5_hsl = "(-1, -1, -1)",
      post_6_hsl = "(-1, -1, -1)"
    )
  )
EXIF
# Split create_date on the space into `date` and `time`, split `date` on ":"
# into year / month / day, then rebuild a `date` column from month and day
# under the fixed year 1881.
# NOTE(review): 1881 is not a leap year, so a 29 February would parse to NA.
exif_tidy <- exif_tidy %>%
  separate(create_date, into = c("date", "time"), sep = " ", remove = TRUE) %>%
  separate(date, into = c("year", "month", "day"), sep = ":") %>%
  mutate(
    date = as.Date(paste("1881", month, day, sep = "-"), format = "%Y-%m-%d")
  )
Img_data
Count by flickr id
# Number of rows recorded for each using_id (result column is `n`).
subimg_qty <- count(imgsd_tidy, using_id)
To my (happy) surprise, only 5 images have fewer than 10 results and only 2 have fewer than 6. In the interest of time, I’m noting these IDs by hand and simply removing them from my working data.
# Keep only the IDs that have at least 6 rows, then restrict both working
# tables to those IDs.
good_ids <- subimg_qty %>%
  filter(n >= 6) %>%
  select(using_id)

imgsd_tidy <- filter(imgsd_tidy, using_id %in% good_ids$using_id)
exif_tidy <- filter(exif_tidy, flickr_id %in% good_ids$using_id)
# total_pixels = element-wise sum of the eight full_*_count columns, added left
# to right; an NA in any of them makes that row's total NA.
imgsd_tidy$total_pixels <- Reduce(
  `+`,
  imgsd_tidy[c(
    "full_red_count",
    "full_orange_count",
    "full_yellow_count",
    "full_green_count",
    "full_cyan_count",
    "full_blue_count",
    "full_purple_count",
    "full_mag_count"
  )]
)
# Strip the list/tuple delimiters from the string-encoded pixel values, then
# split every comma-separated triple into three columns.

# Remove BOTH square brackets from center_hsl. str_replace() only replaces the
# first match, which left the closing "]" attached to the last component
# (center_light) after the split below; str_replace_all() removes every match.
imgsd_tidy$center_hsl <- str_replace_all(imgsd_tidy$center_hsl, "\\[|\\]", "")

# Remove parentheses from the string-encoded tuples.
# NOTE: gsub() returns character, so this converts EVERY column of imgsd_tidy
# (numeric ones included) to character. That behaviour is kept as-is; numeric
# columns must be converted back with as.numeric() before numeric use.
imgsd_tidy <- imgsd_tidy %>%
  mutate(across(everything(), ~ gsub("\\(|\\)", "", .x)))

# Separate each triple into individual columns. The pieces are split on ","
# only, so any whitespace after a comma stays in the piece.
imgsd_split <- imgsd_tidy %>%
  separate(
    center_rgb,
    into = c("center_r", "center_g", "center_b"),
    sep = ","
  ) %>%
  separate(
    post_top_hsl,
    into = c("post_top_hue", "post_top_sat", "post_top_light"),
    sep = ","
  ) %>%
  separate(
    post_2_hsl,
    into = c("post_2_hue", "post_2_sat", "post_2_light"),
    sep = ","
  ) %>%
  separate(
    post_3_hsl,
    into = c("post_3_hue", "post_3_sat", "post_3_light"),
    sep = ","
  ) %>%
  separate(
    post_4_hsl,
    into = c("post_4_hue", "post_4_sat", "post_4_light"),
    sep = ","
  ) %>%
  separate(
    post_5_hsl,
    into = c("post_5_hue", "post_5_sat", "post_5_light"),
    sep = ","
  ) %>%
  separate(
    post_6_hsl,
    into = c("post_6_hue", "post_6_sat", "post_6_light"),
    sep = ","
  ) %>%
  separate(
    center_hsl,
    into = c("center_hue", "center_sat", "center_light"),
    sep = ","
  ) %>%
  separate(
    common_hsl_1_val,
    into = c("common_hsl_1_hue", "common_hsl_1_sat", "common_hsl_1_light"),
    sep = ","
  ) %>%
  separate(
    common_hsl_2_val,
    into = c("common_hsl_2_hue", "common_hsl_2_sat", "common_hsl_2_light"),
    sep = ","
  ) %>%
  separate(
    common_hsl_3_val,
    into = c("common_hsl_3_hue", "common_hsl_3_sat", "common_hsl_3_light"),
    sep = ","
  ) %>%
  separate(
    common_hsl_4_val,
    into = c("common_hsl_4_hue", "common_hsl_4_sat", "common_hsl_4_light"),
    sep = ","
  )
… I probably should have saved those independently during the Python image-processing stage.
To be added as needed:
# Convert the two plotted columns to numeric before handing them to plotly.
imgsd_split[c("common_hsl_4_hue", "common_hsl_4_sat")] <- lapply(
  imgsd_split[c("common_hsl_4_hue", "common_hsl_4_sat")],
  as.numeric
)

# Scatter of the 4th common HSL value: hue on x, saturation on y, coloured by
# sub_img.
# NOTE(review): custom_colors is not defined in this section; presumably it is
# set up elsewhere in the notebook.
fig <- imgsd_split %>%
  plot_ly(
    x = ~common_hsl_4_hue,
    y = ~common_hsl_4_sat,
    type = "scatter",
    mode = "markers",
    size = 2,
    color = ~sub_img,
    colors = custom_colors
  )
fig
Warning: Ignoring 2 observations
Warning: Ignoring 2 observations
Observation: Here we can more closely examine the hue trends present in the images. I’m fascinated by the cluster at 105 hue/60-80 saturation which is present in many sub-images but not 3, 9, or 6.
# Convert the two plotted columns to numeric before handing them to plotly.
imgsd_split[c("light_min_val", "light_max_val")] <- lapply(
  imgsd_split[c("light_min_val", "light_max_val")],
  as.numeric
)

# Scatter of minimum vs. maximum lightness, coloured by sub_img.
# NOTE(review): custom_colors is not defined in this section; presumably it is
# set up elsewhere in the notebook.
fig <- imgsd_split %>%
  plot_ly(
    x = ~light_min_val,
    y = ~light_max_val,
    type = "scatter",
    mode = "markers",
    size = 2,
    color = ~sub_img,
    colors = custom_colors
  )
fig
# Convert the two plotted columns to numeric before handing them to plotly.
imgsd_split[c("sat_min_val", "sat_max_val")] <- lapply(
  imgsd_split[c("sat_min_val", "sat_max_val")],
  as.numeric
)

# Scatter of minimum vs. maximum saturation, coloured by sub_img.
# NOTE(review): custom_colors is not defined in this section; presumably it is
# set up elsewhere in the notebook.
fig <- imgsd_split %>%
  plot_ly(
    x = ~sat_min_val,
    y = ~sat_max_val,
    type = "scatter",
    mode = "markers",
    size = 2,
    color = ~sub_img,
    colors = custom_colors
  )
fig
NA